###############################################################
# 1) Male and female graduate enrollment (1967 - 2015)
###############################################################
# Creating a simplified data frame excluding non profit and for profit schools as well as irrelavent totals
grad_enroll_ftf <- grad_enroll %>%
select(Year, full_time_m, full_time_f, part_time_m, part_time_f)
#creating a linear model for male graduate enrollment
male_grad <- lm(grad_enroll$total_males ~ grad_enroll$Year)
summary(male_grad)
##
## Call:
## lm(formula = grad_enroll$total_males ~ grad_enroll$Year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -96461 -34861 -12841 51876 95766
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17112153 1087024 -15.74 <2e-16 ***
## grad_enroll$Year 9069 546 16.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54050 on 47 degrees of freedom
## Multiple R-squared: 0.8545, Adjusted R-squared: 0.8514
## F-statistic: 276 on 1 and 47 DF, p-value: < 2.2e-16
plot(male_grad)




#creating a linear model for female graduate enrollment
female_grad <- lm(grad_enroll$total_females ~ grad_enroll$Year)
summary(female_grad)
##
## Call:
## lm(formula = grad_enroll$total_females ~ grad_enroll$Year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -89397 -48101 -7633 45267 129727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.896e+07 1.161e+06 -50.77 <2e-16 ***
## grad_enroll$Year 3.013e+04 5.832e+02 51.66 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 57730 on 47 degrees of freedom
## Multiple R-squared: 0.9827, Adjusted R-squared: 0.9823
## F-statistic: 2669 on 1 and 47 DF, p-value: < 2.2e-16
plot(female_grad)




#create a graph of the models
enroll_graph <- grad_enroll %>%
ggplot(aes(x = Year, y = total_males))+
geom_point(aes(color = "total_males"))+
geom_smooth(aes(x = Year, y = total_males),method = lm, se = TRUE, size = 0.5, color = "gray20") + #plots the linear model with a confidence interval (se)
geom_point(aes(x = Year, y = total_females, color = "total_females")) +
geom_smooth(aes(x = Year, y = total_females),method = lm, se = TRUE, size = 0.5, color = "gray20")+
theme_classic() +
theme(legend.title=element_blank()) +
scale_color_manual(" ", breaks = c("total_males", "total_females"), values = c("total_males" = "royalblue1", "total_females" = "palevioletred1"), labels=c("Males", "Females")) +
labs(x = "\nYear", y = "Total Enrollment\n", title = "Male and Female Graduate Enrollment (1967-2015)\n") +
scale_x_continuous(expand = c(0,0), limits = c(1967, 2015)) +
theme(text = element_text(family = "Times New Roman")) +
theme(plot.title = element_text(hjust = 0.5, size = 13)) +
theme(axis.title = element_text(size = 12), axis.text = element_text(size = 10))
enroll_graph

###############################################################
# 2) Shifts in female PhD recipients by field (1985, 2000, and 2015)
###############################################################
# First creating a data frame with just the 5 fields in question and just the 3 years of interest.
phds_summary <- phds %>%
select("year", "physci_f", "engineer_f", "ed_f", "humart_f") %>%
filter(year == "1985" | year == "2000" | year == "2015") %>%
select("physci_f", "engineer_f", "ed_f", "humart_f") # I realize this line of code is redundant, but just put it in for my own understanding of the order in which things occurred.
rownames(phds_summary) <- c("1985", "2000", "2015")
## Warning: Setting row names on a tibble is deprecated.
#maybe a chisquare tests for proportions of females in each field by year
phds_chi <- chisq.test(phds_summary)
phds_chi
##
## Pearson's Chi-squared test
##
## data: phds_summary
## X-squared = 2073.3, df = 6, p-value < 2.2e-16
phds_prop <- prop.table(as.matrix(phds_summary), 1)
phds_summary2 <- phds %>%
select("year", "physci_f", "engineer_f", "ed_f", "humart_f") %>%
filter(year == "1985" | year == "2000" | year == "2015")
phds_summary3<-melt(phds_summary2, id.vars = 'year')
phds_graph<- phds_summary3 %>%
ggplot(aes(fill = variable, y = value, x = year)) +
geom_bar(stat = "identity", position = "fill") +
scale_x_continuous(breaks = c(1985, 2000, 2015)) +
scale_y_continuous(expand = c(0,0)) +
theme_classic() +
labs(x = "\nYear", y = "Proportion\n", title = "Proportion of Female PhD Recipients\n") +
theme(text = element_text(family = "Times New Roman")) +
theme(plot.title = element_text(hjust = 0.5, size = 13)) +
theme(axis.title = element_text(size = 12), axis.text = element_text(size = 10)) +
scale_fill_manual(values = c("lightpink1", "palevioletred1", "violetred", "maroon4"), labels = c("Physical and Earth Sciences", "Engineering", "Education", "Humanities and Arts")) +
theme(legend.title=element_blank())
phds_graph

# p-value < 0.001 therefore there is a significant association between the year a degree was awarded and the number of PhDs awarded to women in that year (X^2 = 2073, *p* , 0.001)
###############################################################
# 3) Male and female salaries for starting postdoctoral and other employment positions (2015)
###############################################################
#2 mann whitney u tests (one for median postdoc salary male vs female, one for median other employment male vs female)
#explore data for posdoc salaries
male_post_sal <- ggplot(med_salary, aes(x = postdoc_m)) +
geom_histogram(binwidth = 5000, aes(color = "black"))
male_post_sal #not normally distributed

# Checking the qq plot for this distribution
male_post_qq <- ggplot(med_salary, aes(sample = postdoc_m)) +
geom_qq()
male_post_qq

#Looking like it's potentially linear
female_post_sal <- ggplot(med_salary, aes(x = postdoc_f)) +
geom_histogram(binwidth = 5000, aes(color = "black"))
female_post_sal #not normally distributed

# Checking the qq plot for this distribution
female_post_qq <- ggplot(med_salary, aes(sample = postdoc_f)) +
geom_qq()
female_post_qq

# NOT looking linear
#want comparison of means so do Mann Whitney U
#H0: Ranks are equal
#HA: ranks are different
post_sal_test <- wilcox.test(med_salary$postdoc_f, med_salary$postdoc_m, paired = TRUE)
## Warning in wilcox.test.default(med_salary$postdoc_f,
## med_salary$postdoc_m, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(med_salary$postdoc_f,
## med_salary$postdoc_m, : cannot compute exact p-value with zeroes
post_sal_test
##
## Wilcoxon signed rank test with continuity correction
##
## data: med_salary$postdoc_f and med_salary$postdoc_m
## V = 16.5, p-value = 0.8884
## alternative hypothesis: true location shift is not equal to 0
med_sal_2<-melt(med_salary, id.vars = 'field')
post_sal_graph <- med_sal_2 %>%
filter(variable == "postdoc_m" | variable == "postdoc_f") %>%
ggplot(aes(x = field, y = value)) +
geom_col(aes(fill = variable), position = "dodge") +
scale_fill_manual(values = c("royalblue1", "palevioletred1"), labels = c("Male", "Female")) +
theme_classic() +
coord_flip() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8)) +
theme(legend.title=element_blank())+
labs(x = "\nField", y = "Median Salary\n", title = "Male and Female Median Postdoc Salaries 2015\n") +
theme(text = element_text(family = "Times New Roman")) +
theme(plot.title = element_text(hjust = 0.5, size = 13)) +
theme(axis.title = element_text(size = 12)) +
scale_y_continuous(expand = c(0,0)) +
scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 20, simplify = FALSE), paste, collapse="\n"))
post_sal_graph

#explore data for employement salaries
male_employ_sal <- ggplot(med_salary, aes(x = employment_m)) +
geom_histogram(binwidth = 15000)
male_employ_sal #maybe normally distributed?

female_employ_sal <- ggplot(med_salary, aes(x = employment_f)) +
geom_histogram(binwidth = 15000)
female_employ_sal #maybe normally distributed?

employ_sal_test <- wilcox.test(med_salary$employment_f, med_salary$employment_m, paired = TRUE)
## Warning in wilcox.test.default(med_salary$employment_f,
## med_salary$employment_m, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(med_salary$employment_f,
## med_salary$employment_m, : cannot compute exact p-value with zeroes
employ_sal_test
##
## Wilcoxon signed rank test with continuity correction
##
## data: med_salary$employment_f and med_salary$employment_m
## V = 4, p-value = 0.002572
## alternative hypothesis: true location shift is not equal to 0
employ_sal_graph <- med_sal_2 %>%
filter(variable == "employment_m" | variable == "employment_f") %>%
ggplot(aes(x = field, y = value)) +
coord_flip() +
geom_col(aes(fill = variable), position = "dodge") +
scale_fill_manual(values = c("royalblue1", "palevioletred1"), labels = c("Male", "Female")) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8)) +
theme(legend.title=element_blank())+
labs(x = "\nField", y = "Median Salary\n", title = "Male and Female Median non Postdoc Salaries 2015\n") +
theme(text = element_text(family = "Times New Roman")) +
theme(plot.title = element_text(hjust = 0.5, size = 13)) +
theme(axis.title = element_text(size = 12)) +
scale_y_continuous(expand = c(0,0)) +
scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 20, simplify = FALSE), paste, collapse="\n"))
employ_sal_graph

###############################################################
# 4) Exploring academic salaries for professors in U.S. colleges
###############################################################
# Find a good multiple linear regression model with the output being salary, and the variables (some combination of) sex, discipline, rank, years of service, years since phd
# Some data exploration
#plot of salary v years service
salary_plot <- ggplot (salary_data, aes(x=salary, y=yrs.service))+
geom_point(aes(color=sex), alpha=0.6)
salary_plot

#plot of salary v years service
salary_plot2 <- ggplot (salary_data, aes(x=salary, y=yrs.since.phd))+
geom_point(aes(color=sex), alpha=0.6)
salary_plot2

#plot of salary by faculty rank
facultyrank_plot <- ggplot (salary_data, aes(x=faculty.rank, y=salary))+
geom_point(aes(color=sex))
facultyrank_plot

# well this doesn't seem to be saying much, not seeing any glaring trends here
# summary table looking at salary by sex and faculty position... interesting? maybe.
avg_salary_sex <- salary_data %>%
group_by(sex, faculty.rank) %>%
summarize(mean= mean(salary),
count=n())
avg_salary_sex
## # A tibble: 6 x 4
## # Groups: sex [?]
## sex faculty.rank mean count
## <chr> <chr> <dbl> <int>
## 1 Female AssocProf 88513. 10
## 2 Female AsstProf 78050. 11
## 3 Female Prof 121968. 18
## 4 Male AssocProf 94870. 54
## 5 Male AsstProf 81311. 56
## 6 Male Prof 127121. 248
# linear model with ALL possible variables
salary_lm1 <- lm (salary ~ sex+faculty.rank+discipline+yrs.since.phd+yrs.service, data=salary_data)
summary(salary_lm1)
##
## Call:
## lm(formula = salary ~ sex + faculty.rank + discipline + yrs.since.phd +
## yrs.service, data = salary_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65248 -13211 -1775 10384 99592
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78862.8 4990.3 15.803 < 2e-16 ***
## sexMale 4783.5 3858.7 1.240 0.21584
## faculty.rankAsstProf -12907.6 4145.3 -3.114 0.00198 **
## faculty.rankProf 32158.4 3540.6 9.083 < 2e-16 ***
## disciplineB 14417.6 2342.9 6.154 1.88e-09 ***
## yrs.since.phd 535.1 241.0 2.220 0.02698 *
## yrs.service -489.5 211.9 -2.310 0.02143 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22540 on 390 degrees of freedom
## Multiple R-squared: 0.4547, Adjusted R-squared: 0.4463
## F-statistic: 54.2 on 6 and 390 DF, p-value: < 2.2e-16
# taking out yrs.since.phd, since it likely pretty much same as years.service; also makes no sense that the years of service would have a negative coefficient
salary_lm2 <- lm(salary ~ sex+faculty.rank+discipline+yrs.since.phd, data=salary_data)
summary(salary_lm2)
##
## Call:
## lm(formula = salary ~ sex + faculty.rank + discipline + yrs.since.phd,
## data = salary_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67451 -13860 -1549 10716 97023
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 80988.47 4931.84 16.422 < 2e-16 ***
## sexMale 4349.37 3875.39 1.122 0.26242
## faculty.rankAsstProf -13104.15 4167.31 -3.145 0.00179 **
## faculty.rankProf 32928.40 3544.40 9.290 < 2e-16 ***
## disciplineB 13937.47 2346.53 5.940 6.32e-09 ***
## yrs.since.phd 61.01 127.01 0.480 0.63124
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22660 on 391 degrees of freedom
## Multiple R-squared: 0.4472, Adjusted R-squared: 0.4401
## F-statistic: 63.27 on 5 and 391 DF, p-value: < 2.2e-16
# yrs.service still has a negative coefficient... this is not logical
# taking out faculty.rank (since rank is mostly just determined by how long they are there?)
salary_lm3 <- lm(salary ~ faculty.rank + discipline + yrs.since.phd + sex + sex*faculty.rank, data=salary_data)
summary(salary_lm3)
##
## Call:
## lm(formula = salary ~ faculty.rank + discipline + yrs.since.phd +
## sex + sex * faculty.rank, data = salary_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67531 -13938 -1215 10765 96921
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79193.33 7654.84 10.346 < 2e-16 ***
## faculty.rankAsstProf -7848.08 10016.19 -0.784 0.433787
## faculty.rankProf 33599.53 9015.90 3.727 0.000223 ***
## disciplineB 14028.24 2355.32 5.956 5.79e-09 ***
## yrs.since.phd 58.23 127.64 0.456 0.648510
## sexMale 6464.05 7817.86 0.827 0.408839
## faculty.rankAsstProf:sexMale -6308.13 10839.48 -0.582 0.560932
## faculty.rankProf:sexMale -874.01 9603.83 -0.091 0.927535
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22710 on 389 degrees of freedom
## Multiple R-squared: 0.4478, Adjusted R-squared: 0.4379
## F-statistic: 45.07 on 7 and 389 DF, p-value: < 2.2e-16
plot(salary_lm3)




salary_lm4<- lm (salary~discipline+yrs.service+sex, data= salary_data)
summary(salary_lm4)
##
## Call:
## lm(formula = salary ~ discipline + yrs.service + sex, data = salary_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -77424 -19404 -4635 15539 105391
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 84361.1 4941.4 17.072 < 2e-16 ***
## disciplineB 13033.8 2840.3 4.589 6.01e-06 ***
## yrs.service 832.2 110.2 7.550 3.07e-13 ***
## sexMale 8423.3 4744.5 1.775 0.0766 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27790 on 393 degrees of freedom
## Multiple R-squared: 0.1646, Adjusted R-squared: 0.1582
## F-statistic: 25.81 on 3 and 393 DF, p-value: 2.928e-15
stargazer(salary_lm2, salary_lm3, salary_lm4, type = "html")
##
## <table style="text-align:center"><tr><td colspan="4" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"></td><td colspan="3"><em>Dependent variable:</em></td></tr>
## <tr><td></td><td colspan="3" style="border-bottom: 1px solid black"></td></tr>
## <tr><td style="text-align:left"></td><td colspan="3">salary</td></tr>
## <tr><td style="text-align:left"></td><td>(1)</td><td>(2)</td><td>(3)</td></tr>
## <tr><td colspan="4" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">sexMale</td><td>4,349.366</td><td>6,464.051</td><td>8,423.335<sup>*</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(3,875.393)</td><td>(7,817.858)</td><td>(4,744.537)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">faculty.rankAsstProf:sexMale</td><td></td><td>-6,308.131</td><td></td></tr>
## <tr><td style="text-align:left"></td><td></td><td>(10,839.480)</td><td></td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">faculty.rankProf:sexMale</td><td></td><td>-874.006</td><td></td></tr>
## <tr><td style="text-align:left"></td><td></td><td>(9,603.828)</td><td></td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">faculty.rankAsstProf</td><td>-13,104.150<sup>***</sup></td><td>-7,848.084</td><td></td></tr>
## <tr><td style="text-align:left"></td><td>(4,167.315)</td><td>(10,016.190)</td><td></td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">faculty.rankProf</td><td>32,928.400<sup>***</sup></td><td>33,599.530<sup>***</sup></td><td></td></tr>
## <tr><td style="text-align:left"></td><td>(3,544.403)</td><td>(9,015.904)</td><td></td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">disciplineB</td><td>13,937.470<sup>***</sup></td><td>14,028.240<sup>***</sup></td><td>13,033.850<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(2,346.534)</td><td>(2,355.315)</td><td>(2,840.349)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">yrs.since.phd</td><td>61.011</td><td>58.228</td><td></td></tr>
## <tr><td style="text-align:left"></td><td>(127.010)</td><td>(127.640)</td><td></td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">yrs.service</td><td></td><td></td><td>832.154<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td>(110.215)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">Constant</td><td>80,988.470<sup>***</sup></td><td>79,193.330<sup>***</sup></td><td>84,361.070<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(4,931.844)</td><td>(7,654.839)</td><td>(4,941.371)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td colspan="4" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Observations</td><td>397</td><td>397</td><td>397</td></tr>
## <tr><td style="text-align:left">R<sup>2</sup></td><td>0.447</td><td>0.448</td><td>0.165</td></tr>
## <tr><td style="text-align:left">Adjusted R<sup>2</sup></td><td>0.440</td><td>0.438</td><td>0.158</td></tr>
## <tr><td style="text-align:left">Residual Std. Error</td><td>22,663.240 (df = 391)</td><td>22,708.750 (df = 389)</td><td>27,789.810 (df = 393)</td></tr>
## <tr><td style="text-align:left">F Statistic</td><td>63.266<sup>***</sup> (df = 5; 391)</td><td>45.071<sup>***</sup> (df = 7; 389)</td><td>25.810<sup>***</sup> (df = 3; 393)</td></tr>
## <tr><td colspan="4" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"><em>Note:</em></td><td colspan="3" style="text-align:right"><sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01</td></tr>
## </table>